HW 02

Author

Weston Scott

1 - A new day, a new plot, a new geom

edibnb <- dsbox::edibnb
glimpse(edibnb)
Rows: 13,245
Columns: 10
$ id                   <dbl> 15420, 24288, 38628, 44552, 47616,…
$ price                <dbl> 80, 115, 46, 32, 100, 71, 175, 150…
$ neighbourhood        <chr> "New Town", "Southside", NA, "Leit…
$ accommodates         <dbl> 2, 4, 2, 2, 2, 3, 5, 5, 6, 10, 2, …
$ bathrooms            <dbl> 1.0, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0,…
$ bedrooms             <dbl> 1, 2, 0, 1, 1, 1, 2, 3, 4, 4, 1, 1…
$ beds                 <dbl> 1, 2, 2, 1, 1, 2, 3, 4, 5, 7, 1, 1…
$ review_scores_rating <dbl> 99, 92, 94, 93, 98, 97, 100, 92, 9…
$ number_of_reviews    <dbl> 283, 199, 52, 184, 32, 762, 7, 28,…
$ listing_url          <chr> "https://www.airbnb.com/rooms/1542…
summary(edibnb)
       id               price        neighbourhood     
 Min.   :   15420   Min.   :  0.00   Length:13245      
 1st Qu.:13279107   1st Qu.: 49.00   Class :character  
 Median :20171841   Median : 75.00   Mode  :character  
 Mean   :20077242   Mean   : 97.21                     
 3rd Qu.:27397925   3rd Qu.:110.00                     
 Max.   :36066014   Max.   :999.00                     
                    NA's   :199                        
  accommodates      bathrooms        bedrooms     
 Min.   : 1.000   Min.   :0.000   Min.   : 0.000  
 1st Qu.: 2.000   1st Qu.:1.000   1st Qu.: 1.000  
 Median : 3.000   Median :1.000   Median : 1.000  
 Mean   : 3.541   Mean   :1.226   Mean   : 1.583  
 3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 2.000  
 Max.   :19.000   Max.   :9.000   Max.   :13.000  
                  NA's   :12      NA's   :4       
      beds        review_scores_rating number_of_reviews
 Min.   : 0.000   Min.   : 20.00       Min.   :  0.00   
 1st Qu.: 1.000   1st Qu.: 93.00       1st Qu.:  2.00   
 Median : 2.000   Median : 97.00       Median : 12.00   
 Mean   : 2.032   Mean   : 95.02       Mean   : 37.73   
 3rd Qu.: 3.000   3rd Qu.: 99.00       3rd Qu.: 45.00   
 Max.   :30.000   Max.   :100.00       Max.   :773.00   
 NA's   :15       NA's   :2177                          
 listing_url       
 Length:13245      
 Class :character  
 Mode  :character  
                   
                   
                   
                   
edibnb <- edibnb |>
    mutate(
        neighbourhood = fct_reorder(neighbourhood, 
                                    review_scores_rating, 
                                    .fun = median)
    ) |>
    filter(!is.na(neighbourhood))
ggplot(
    data = edibnb, 
    aes(
        x = review_scores_rating, 
        y = neighbourhood, 
        fill = neighbourhood
    )
) +
    
geom_density_ridges(
    scale = 2,
    rel_min_height = 0.01,
    legend.show = FALSE,
    alpha = 0.8
) +

scale_fill_viridis_d(
    option = "C", 
    begin = 0.1, 
    end = 0.9
) +

scale_y_discrete(expand = c(0, 0)) + 
scale_x_continuous(
    expand = c(0, 0.25),
    limits = c(90, 100)
) + 

coord_cartesian(clip = "off") +
labs(
    title = "Problem 1 - Ridgeline plot", 
    subtitle = "Airbnb listings: Edinburgh, Scotland",
    x = "Review Score Ratings",
    y = "Edinburgh\nNeighborhoods", 
    caption = "Source: Opensource dataset dsbox::edibnb"
) +

theme_ridges() +
theme(legend.position = "none")

Intepretation

The ridgeline plot above visualizes the distribution of Airbnb review scores across different Edinburgh neighborhoods, ordered by their respective median review scores. The neighborhoods with the highest overall reviews appear at the top of the plot with a descending order down the plot to the neighborhoods with the lowest review scores. Most review scores cluster tightly between 90 and 100 (x-axis), suggesting generally positive experiences overall in the set of reviews. However, some neighborhoods display broader distributions or lower medians. A broader distribution indicates that the reviews have a larger spread along the review spectrum.

2 - Foreign Connected PACs

# get a list of files with "Foreign Connected PAC" in their names
list_of_files <- dir_ls(path = "data", regexp = "Foreign Connected PAC")

# read all files and row bind them
# keeping track of the file name in a new column called year
pac <- read_csv(list_of_files, id = "year")
glimpse(pac)
Rows: 2,394
Columns: 6
$ year                               <chr> "data/Foreign Connec…
$ `PAC Name (Affiliate)`             <chr> "7-Eleven", "ABB Gro…
$ `Country of Origin/Parent Company` <chr> "Japan/Ito-Yokado", …
$ Total                              <chr> "$8500", "$46000", "…
$ Dems                               <chr> "$1500", "$17000", "…
$ Repubs                             <chr> "$7000", "$28500", "…
pac <- pac |>
    clean_names() |>
    separate(
        country_of_origin_parent_company,
        into = c("country", "parent_company"),
        sep = "/", 
        remove = TRUE) |>

    mutate(
        year = str_extract(year, "\\d{4}-\\d{4}"),
        year = str_extract(year, "\\d{4}$"),
        year = as.integer(year)
    ) |>

    select(-total)

pac
# A tibble: 2,394 × 6
    year pac_name_affiliate   country parent_company dems  repubs
   <int> <chr>                <chr>   <chr>          <chr> <chr> 
 1  2000 7-Eleven             Japan   Ito-Yokado     $1500 $7000 
 2  2000 ABB Group            Switze… Asea Brown Bo… $170… $28500
 3  2000 Accenture            UK      Accenture plc  $230… $52984
 4  2000 ACE INA              UK      ACE Group      $125… $26000
 5  2000 Acuson Corp (Siemen… Germany Siemens AG     $2000 $0    
 6  2000 Adtranz (DaimlerChr… Germany DaimlerChrysl… $100… $500  
 7  2000 AE Staley Manufactu… UK      Tate & Lyle    $100… $14000
 8  2000 AEGON USA (AEGON NV) Nether… Aegon NV       $105… $47750
 9  2000 AIM Management Group UK      AMVESCAP       $100… $15000
10  2000 Air Liquide America  France  L'Air Liquide… $0    $0    
# ℹ 2,384 more rows
pac <- pac |>
    pivot_longer(
        cols = c(dems, repubs),
        names_to = "party",
        values_to = "amount"
    ) |>

    mutate(
        amount = str_remove(amount, "\\$"),
        amount = as.integer(amount)
    )

pac
# A tibble: 4,788 × 6
    year pac_name_affiliate   country parent_company party amount
   <int> <chr>                <chr>   <chr>          <chr>  <int>
 1  2000 7-Eleven             Japan   Ito-Yokado     dems    1500
 2  2000 7-Eleven             Japan   Ito-Yokado     repu…   7000
 3  2000 ABB Group            Switze… Asea Brown Bo… dems   17000
 4  2000 ABB Group            Switze… Asea Brown Bo… repu…  28500
 5  2000 Accenture            UK      Accenture plc  dems   23000
 6  2000 Accenture            UK      Accenture plc  repu…  52984
 7  2000 ACE INA              UK      ACE Group      dems   12500
 8  2000 ACE INA              UK      ACE Group      repu…  26000
 9  2000 Acuson Corp (Siemen… Germany Siemens AG     dems    2000
10  2000 Acuson Corp (Siemen… Germany Siemens AG     repu…      0
# ℹ 4,778 more rows
uk_spending <- pac |>
    filter(country == "UK") |>
    group_by(year, party) |>

    summarise(
        totals = sum(amount[amount != 0],
                     na.rm = TRUE), 
        .groups = "drop") |>

    arrange(year, party)

uk_spending
# A tibble: 24 × 3
    year party   totals
   <int> <chr>    <int>
 1  2000 dems    975725
 2  2000 repubs 2057518
 3  2002 dems   1046183
 4  2002 repubs 2002772
 5  2004 dems   1188801
 6  2004 repubs 2311101
 7  2006 dems   1543755
 8  2006 repubs 3057736
 9  2008 dems   2690413
10  2008 repubs 2842956
# ℹ 14 more rows
ggplot(data = uk_spending, 
       aes(x = year, y = totals / 1e6, color = party)) +

geom_line() +
scale_color_manual(values = c("repubs" = "red", 
                              "dems" = "blue"),
                  labels = c("Democrats", "Republicans")) +
scale_x_continuous(breaks = seq(1996, 2022, by = 4),
                   labels = seq(1996, 2022, by = 4)) +
scale_y_continuous(labels = function(x) paste0("$", round(x), "M")) +

labs(
    title = "Contributions to US political parties from UK-connected PACs",
    color = "Party",
    x = "Year",
    y = "Total amount",
    caption = "Source: OpenSecrets.org"
) +

theme(
    axis.title.y = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0),
                                hjust = 0),
    axis.title.x = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    legend.position = c(0.87, 0.15)
)

swiss_spending <- pac |>
    filter(country == "Switzerland") |>
    group_by(year, party) |>

    summarise(
        totals = sum(amount[amount != 0],
                     na.rm = TRUE), 
        .groups = "drop") |>

    arrange(year, party)


ggplot(data = swiss_spending, 
       aes(x = year, y = totals / 1e6, color = party)) +

geom_line() +
scale_color_manual(values = c("repubs" = "red", 
                              "dems" = "blue"),
                  labels = c("Democrats", "Republicans")) +
scale_x_continuous(breaks = seq(1996, 2022, by = 4),
                   labels = seq(1996, 2022, by = 4)) +
scale_y_continuous(labels = function(x) paste0("$", round(x), "M")) +

labs(
    title = "Contributions to US political parties from Swiss-connected PACs",
    color = "Party",
    x = "Year",
    y = "Total amount",
    caption = "Source: OpenSecrets.org"
) +

theme(
    axis.title.y = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    axis.title.x = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    legend.position = c(0.87, 0.15)
)

Intepretation

Contributions from Swiss-connected PACs to U.S. political parties have grown since the year 2000. The peak around key election years. The data shows a clear preference for Republican candidates, especially from 2008 onward. This might reflect Swiss alignment of ideologies or policies with Republican platforms. In contrast, Democratic contributions also grew, though they remained more modest and stable over the yearly span of this dataset.

3 - Median housing prices in the US

median_housing <- read_csv("data/median-housing.csv")

median_housing <- median_housing |>
  rename(date = DATE) |>
  rename(price = MSPUS)
glimpse(median_housing)
Rows: 234
Columns: 2
$ date  <date> 1963-01-01, 1963-04-01, 1963-07-01, 1963-10-01, …
$ price <dbl> 17800, 18000, 17900, 18500, 18500, 18900, 18900, …
recessions <- read_csv("data/recessions.csv")
glimpse(recessions)
Rows: 34
Columns: 2
$ Peak   <date> 1857-06-01, 1860-10-01, 1865-04-01, 1869-06-01,…
$ Trough <date> 1858-12-01, 1861-06-01, 1867-12-01, 1870-12-01,…
ggplot(data = median_housing, 
       aes(x = date, y = price)) +

geom_line(color = "darkblue") +
scale_x_date(breaks = seq(as.Date("1960-01-01"), 
                          as.Date("2020-01-01"), 
                          by = "5 years"),
             labels = date_format("%Y")) +

scale_y_continuous(breaks = seq(0, 400000, by = 40000),
                   labels = label_number(accuracy = 1, 
                                         big.mark = ",")) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Source: Census; HUD"
) +

theme(
    plot.title.position = "plot",
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
)

recessions <- recessions |>
    mutate(
        is_recess = if_else(Peak >= as.Date("1963-01-01", 
                                            format = "%Y-%m-%d") & 
                            Trough <= as.Date("2021-04-01", 
                                              format = "%Y-%m-%d"),
                            TRUE, FALSE)
    ) |>
    filter(is_recess == TRUE)
glimpse(recessions)
Rows: 8
Columns: 3
$ Peak      <date> 1969-12-01, 1973-11-01, 1980-01-01, 1981-07-…
$ Trough    <date> 1970-11-01, 1975-03-01, 1980-07-01, 1982-11-…
$ is_recess <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
ggplot(data = median_housing, 
       aes(x = date, 
           y = price)) +

geom_rect(
    data = recessions,
    aes(
        xmin = as.Date(Peak), 
        xmax = as.Date(Trough),
        ymin = -Inf, 
        ymax = Inf,
        y = NULL,
        x = NULL
    ), 
    fill = "cornsilk3"
    ) +

geom_line(color = "darkblue") +
scale_x_date(breaks = seq(as.Date("1960-01-01"), 
                          as.Date("2020-01-01"), 
                          by = "5 years"),
             labels = date_format("%Y")) +

scale_y_continuous(breaks = seq(0, 400000, by = 40000),
                   labels = label_number(accuracy = 1, big.mark = ",")) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Shaded areas indicate U.S. recessions\nSource: Census; HUD"
) +

theme(
    plot.title.position = "plot",
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
)

quarters <- median_housing |>
    mutate(
        year = year(date),
        quarter = paste0("Q", quarter(date)),
        ) |>
    arrange(date) |>
    filter(year %in% c(2019, 2020)) |> glimpse()
Rows: 8
Columns: 4
$ date    <date> 2019-01-01, 2019-04-01, 2019-07-01, 2019-10-01…
$ price   <dbl> 313000, 322500, 318400, 327100, 329000, 322600,…
$ year    <dbl> 2019, 2019, 2019, 2019, 2020, 2020, 2020, 2020
$ quarter <chr> "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4"
ggplot(data = quarters, 
       aes(x = date, 
           y = price,
          group = 1)) +

geom_line(color = "darkblue") +
geom_point(color = "darkblue", 
           size = 2, 
           shape = 21, 
           fill = "white") +

scale_y_continuous(breaks = seq(300000, 360000, by = 20000),
                   labels = label_comma()) +

scale_x_date(breaks = quarters$date,
             labels = quarters$quarter,
             expand = c(0.008, 0.008)) +

annotate("text", x = as.Date("2019-05-15"), y = 285000, label = "2019", size = 4) +
annotate("text", x = as.Date("2020-05-15"), y = 285000, label = "2020", size = 4) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
) +

theme(
    plot.title.position = "plot",
    panel.grid.minor.x = element_blank(),
    plot.margin = unit(c(1, 1, 2, 1), "lines"),
    axis.text.x = element_text(size = 8)
) +

coord_cartesian(ylim = c(300000, 360000), clip = "off")

4 - Expect More. Plot More.

Found a method for plotting circles in the library ggforce with geom_circle (geomnet Development Team 2023).

target_data <- tibble(
    origin_x = 0,
    origin_y = 0,
    circle_radius = c(3, 2, 1),
    colors = c("red2", "white", "red2")
)
ggplot() +
    geom_circle(data = target_data, 
                aes(x0 = origin_x, 
                    y0 = origin_y, 
                    r = circle_radius[1], 
                    fill = colors[1]), 
                color = 'white', 
                size = 0.5) +

    geom_circle(data = target_data, 
                aes(x0 = origin_x, 
                    y0 = origin_y, 
                    r = circle_radius[2], 
                    fill = colors[2]), 
                color = 'white', 
                size = 0.5) +

    geom_circle(data = target_data, 
                aes(x0 = origin_x, 
                    y0 = origin_y, 
                    r = circle_radius[3], 
                    fill = colors[3]), 
                color = 'white', 
                size = 0.5) +

    scale_fill_identity() +
    geom_text(aes(x = 0, 
                  y = -4, 
                  label = "TARGET"), 
              size = 9, 
              fontface = "bold", 
              color="red2") +

    geom_text(aes(x = 2, 
                  y = -4.25, 
                  label = "®"), 
              size = 7, 
              fontface = "bold", 
              color="red2") +
    
    coord_fixed(ratio = 1) +
    theme_void() +
    theme(plot.margin = margin(20, 20, 40, 20))

5 - Mirror, mirror on the wall, who’s the ugliest of them all?

geomnet Development Team. 2023. Geom_circle: Add Circles to a Plot. https://rdrr.io/cran/geomnet/man/geom_circle.html.